Quick and dirty first-pass DEG expression analysis for expt. SCL000091

Data Setup

Please copy over the SCL000091.Gene_Expression_Count.txt and the SCL000091_Sample_Metadata.csv files to your home directory that is accessible to the GRP workbench. The /B32NGS/Projects mount where the original files reside isn’t accessible from this server. I’ve copied over my files to this location: /fcrbiouatappn01/home/ssunkara1/SCL000091

My version of the metadata.csv has an additional column ‘grp’ to group samples into the 4 categories: WT, WT-SN38, S, S-SN38

Install DESeq2, Org.Hs.eg.db, clusterProfiler, a few plotting packages, zeallot, fgsea and devtools packages

load packages

read datasets into variables; please substitute paths to your files accordingly

count_data <- as.matrix(read.csv("/fcrbiouatappn01/home/ssunkara1/SCL000091/SCL000091.Gene_Expression_Count.txt",sep="\t",row.names="Gene_ID"))

metadata <- read.csv("/fcrbiouatappn01/home/ssunkara1/SCL000091/SCL000091_Sample_Metadata.csv",sep=",")

typecast grp variable as a factor and populate coldat

coldat <- data.frame(grp=factor(metadata$grp))

Create a DESeqDataSet object from the count matrix using the DESeqDataSetFromMatrix function Arbitrarily filter genes where the cumulative read count over all samples is <=10 (cumreadcount). Cumreadcount cut-off is empirical, and should be further tested. Run the DESeq function on the DESeqDataSet object which fits a GLM model based on the Negative Binomial distribution. DESeq also returns a dds (DESeqDataSetFromMatrix) object.

cumreadcount <- 10

ddsMat <- DESeqDataSetFromMatrix(countData = count_data, colData=coldat, design = ~ grp)

keep <- rowSums(counts(ddsMat)) > cumreadcount

dds <- ddsMat[keep,]

dds <- DESeq(dds)

PCA analysis to look for batch effects in the samples Variance stabilizing transformation (vst) is used for sample clustering. blind = TRUE option looks for across sample variability.

Extract the GLM results from the dds object & run comparisons Comparisons of interest between WT and WT-SN38, and then between S and S-SN38 from the fitted model are done using the contrasts argument to the results function, on the dds object.

res_WT_WTSN38 <- results(dds, contrast=c("grp", "WT", "WT-SN38"), tidy = TRUE)

res_S_SSN38 <- results(dds, contrast=c("grp", "S", "S-SN38"), tidy = TRUE)

data munging to extract differentially expressed genes from the above 2 comparisons The log fold change(lfc) and pval thresholds are arbitrarily chosen here. Some work needs to be done here to choose meaningful cut-offs based on empirical evidence in this expt.

tab_WT_WTSN38 <- data.frame(logFC = res_WT_WTSN38$log2FoldChange, negLogPval = -log10(res_WT_WTSN38$padj))

tab_S_SSN38 <- data.frame(logFC = res_S_SSN38$log2FoldChange, negLogPval = -log10(res_S_SSN38$padj))

pval <- 0.01

lfc <- 1.0

signGenes_WT_WTSN38 <- (abs(tab_WT_WTSN38$logFC) > lfc & tab_WT_WTSN38$negLogPval > -log10(pval))

signGenes_S_SSN38 <- (abs(tab_S_SSN38$logFC) > lfc & tab_S_SSN38$negLogPval > -log10(pval))

DEG_WT_WTSN38 <- res_WT_WTSN38[signGenes_WT_WTSN38,]

DEG_S_SSN38 <- res_S_SSN38[signGenes_S_SSN38,]

volcano plots between the WT vs. WT-SN38 and S vs. S-SN38 comparisons for DE Consider changing the lfc and pval cut-offs in the code-block above, to play around with cut-offs.

# plot both comparisons


volcanoplot <- function(tab, signGenes) {
  
    plot(tab, pch = 16, cex = 0.6, xlab = expression(log[2]~fold~change), ylab = expression(-log[10]~pvalue))
  
    # highlight over-expressed genes in red
  
    points(tab[signGenes, ], pch = 16, cex = 0.6, col = "red")
   
    # Indicate the pval cut-off as a green line

    abline(h = -log10(pval), col = "green3", lty = 2)

    # Indicate the fold change cut-offs as a blue line

    abline(v = c(-lfc, lfc), col = "blue", lty = 2)

    mtext(paste("pval =", pval), side = 4, at = -log10(pval), cex = 0.4, line = 0.3, las = 1)
    mtext(c(paste("-", lfc, "fold"), paste("+", lfc, "fold")), side = 3, at = c(-lfc, lfc), cex = 0.5, line = 0.6)

  
}

par(mfrow=c(1,2))
volcanoplot(tab_WT_WTSN38, signGenes_WT_WTSN38)
volcanoplot(tab_S_SSN38, signGenes_S_SSN38)

catenate gene-names to the DEG_S_SSN38 and DEG_WT_WTSN38 dataframes & sort in decreasing order of fold change

GeneNamecount_data <- as.matrix(read.csv("/fcrbiouatappn01/home/ssunkara1/SCL000091/SCL000091_genes.count.txt",sep="\t"))

genenameDF <- GeneNamecount_data[,c("Gene_ID", "Gene_Name")]

DEG_WT_WTSN38 = merge(x=DEG_WT_WTSN38,y=genenameDF,by.x="row", by.y="Gene_ID")

DEG_WT_WTSN38 <- DEG_WT_WTSN38[order(-DEG_WT_WTSN38$log2FoldChange),]

DEG_S_SSN38 = merge(x=DEG_S_SSN38,y=genenameDF,by.x="row", by.y="Gene_ID")

DEG_S_SSN38 <- DEG_S_SSN38[order(-DEG_S_SSN38$log2FoldChange),]

Extract Genes that are common to both comparisons from above WT vs. WT-SN38 has more differentially expressed genes than the S vs. S-SN38

nrow(DEG_WT_WTSN38)
## [1] 3575
nrow(DEG_S_SSN38)
## [1] 2111
DEG_commongenes <- DEG_WT_WTSN38[DEG_WT_WTSN38$row %in% DEG_S_SSN38$row, ]
nrow(DEG_commongenes)
## [1] 1118

R function to generate genelist with log fold change.

genelistfunc <- function(DEGdf) {
  
  genes_lfc <- DEGdf$log2FoldChange
  DEGdf$row <- gsub("\\..*", '', DEGdf$row)
  names(genes_lfc) <- DEGdf$row
  list(genes_lfc, DEGdf)
    
}

Data frames and gene-lists for WT, SN38 and common-genes

c(commongenes_lfc, DEG_commongenes) %<-% genelistfunc(DEG_commongenes)

c(WT_WTSN38_lfc, DEG_WT_WTSN38) %<-% genelistfunc(DEG_WT_WTSN38)

c(S_SSN38_lfc, DEG_S_SSN38) %<-% genelistfunc(DEG_S_SSN38)

GO gene enrichment analysis and over-representation analysis Analysis is done using the ClusterProfiler package. Feel free to reevaluate enriched genes below, based on other ontology types like “BP” and “CC”.

GOenrichfunc <- function(genes_lfc, DEGdf) {
  
  # Enrichment analysis
  
  gse <- gseGO(geneList=genes_lfc, ont ="MF", keyType = "ENSEMBL", minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.05, verbose = TRUE, OrgDb = "org.Hs.eg.db", pAdjustMethod = "none", nPermSimple = 10000)
  
  # Over representation analysis
  
  overrep <- enrichGO(gene = DEGdf$row, keyType = "ENSEMBL", OrgDb = 'org.Hs.eg.db', ont = "MF", pAdjustMethod = "none", pvalueCutoff  = 0.05, qvalueCutoff  = 0.05, readable = TRUE)
  
  list(gse, overrep)
  
}

c(gse_commongenes, overrep_commongenes) %<-% GOenrichfunc(commongenes_lfc, DEG_commongenes)
c(gse_WT, overrep_WT) %<-% GOenrichfunc(WT_WTSN38_lfc, DEG_WT_WTSN38)
c(gse_S, overrep_S) %<-% GOenrichfunc(S_SSN38_lfc, DEG_S_SSN38)

# plot dotplots for enrichement analysis. Feel free to test other comparisons, for example: WT or SN38 

dotplot(gse_commongenes, showCategory=10, font.size=6, split=".sign") + facet_grid(.~.sign)

R function for pathway analysis. Pathway analysis using the gseKEGG function from clusterProfiler. clusterProfiler supports direct access to the latest KEGG database. KEGG analysis needs the gene_ids in Entrez format. Remapping the gene ids from Ensembl to Entrez format below. (Note that the ENSEMBL_id <-> Entrez_id conversion is not 100%). In this case, <1% of the common_genes with an Ensembl_id do not have an Entrez id.

pathwayfunc <- function(genelist_lfc, DEGdf) {
  
  # map ENSEMBL to ENTREZIDs.
  ids <- bitr(names(genelist_lfc), fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
  
  # dedup the ENSEMBL ids
  dedup_ids <- ids[!duplicated(ids[c("ENSEMBL")]),]

  # DEG dataframe with the deduped Ensembl ids
  DEGdf <- DEGdf[DEGdf$row %in% dedup_ids$ENSEMBL,]
  
  # add Entrez_ids to the DEGdf dataframe
  DEGdf$ENTREZID = dedup_ids$ENTREZID
  
  # generate kegg gene list with ENTREZ IDs and fold changes from the deduped DEGdf
  kegg_gene_list <- DEGdf$log2FoldChange
  names(kegg_gene_list) <- DEGdf$ENTREZID
  
  # na shouldn't be an issue, but just in case. Follow by sort
  kegg_gene_list<-na.omit(kegg_gene_list)
  kegg_gene_list <- sort(kegg_gene_list, decreasing = TRUE)
  
  # Look for over-represented pathways based on gene-names. 
  
  genes <- names(kegg_gene_list)
  represented_pathway <- enrichKEGG(gene = genes, organism = 'hsa', pvalueCutoff = 0.05)
  
  # Enrichment analysis using the gseKEGG function (leverages log2FC, and not just gene names).
  enriched_pathway <- gseKEGG(geneList = kegg_gene_list, organism = 'hsa', nPermSimple = 10000, minGSSize =3, maxGSSize = 800, pvalueCutoff = 0.05, pAdjustMethod = "none", keyType = "ncbi-geneid")
  
  # return the over-represented pathway, enriched pathway, modified dataframe as well as the sorted gene list with entrez ids.
  list(represented_pathway, enriched_pathway, DEGdf, kegg_gene_list)
  
}

# run the above function for common-genes, WT DE genes, and SN38 DE genes
c(commongenes_Opathway, commongenes_pathway, DEG_commongenes, ENTREZ_commongenes) %<-% pathwayfunc(commongenes_lfc, DEG_commongenes)

c(WT_WTSN38_Opathway, WT_pathway, DEG_WT_WTSN38, ENTREZ_WT_WTSN38) %<-% pathwayfunc(WT_WTSN38_lfc, DEG_WT_WTSN38)

c(S_SSN38_Opathway, S_pathway, DEG_S_SSN38, ENTREZ_S_SSN38) %<-% pathwayfunc(S_SSN38_lfc, DEG_S_SSN38)

# You may want to look for over-represented pathways from the other comparisons, simply by changing out the *_Opathway variable below:

head(commongenes_Opathway, n=30) %>% as_tibble()
## # A tibble: 15 × 9
##    ID       Description     GeneR…¹ BgRatio  pvalue p.adj…²  qvalue geneID Count
##    <chr>    <chr>           <chr>   <chr>     <dbl>   <dbl>   <dbl> <chr>  <int>
##  1 hsa04010 MAPK signaling… 35/467  294/81… 2.59e-5 0.00514 0.00412 26281…    35
##  2 hsa04310 Wnt signaling … 24/467  170/81… 3.31e-5 0.00514 0.00412 8313/…    24
##  3 hsa04330 Notch signalin… 12/467  59/8170 1.01e-4 0.0104  0.00834 55534…    12
##  4 hsa04020 Calcium signal… 28/467  240/81… 2.36e-4 0.0171  0.0137  26281…    28
##  5 hsa05226 Gastric cancer  20/467  149/81… 2.97e-4 0.0171  0.0137  26281…    20
##  6 hsa04060 Cytokine-cytok… 32/467  295/81… 3.31e-4 0.0171  0.0137  1235/…    32
##  7 hsa04925 Aldosterone sy… 15/467  98/8170 4.10e-4 0.0182  0.0146  57118…    15
##  8 hsa04929 GnRH secretion  11/467  64/8170 8.92e-4 0.0337  0.0270  776/5…    11
##  9 hsa04971 Gastric acid s… 12/467  76/8170 1.15e-3 0.0337  0.0270  3784/…    12
## 10 hsa04973 Carbohydrate d… 9/467   47/8170 1.17e-3 0.0337  0.0270  776/5…     9
## 11 hsa04921 Oxytocin signa… 19/467  154/81… 1.19e-3 0.0337  0.0270  5530/…    19
## 12 hsa04360 Axon guidance   21/467  182/81… 1.60e-3 0.0385  0.0308  10371…    21
## 13 hsa05410 Hypertrophic c… 13/467  90/8170 1.70e-3 0.0385  0.0308  1674/…    13
## 14 hsa05224 Breast cancer   18/467  147/81… 1.74e-3 0.0385  0.0308  26281…    18
## 15 hsa04625 C-type lectin … 14/467  104/81… 2.27e-3 0.0469  0.0376  5530/…    14
## # … with abbreviated variable names ¹​GeneRatio, ²​p.adjust
# dotplot of the enriched kegg pathways
# dotplots for enriched kegg pathways from other comparisons could similarly be visualized by changing out the *_pathway variable.

dotplot(commongenes_pathway, showCategory = 10, font.size = 6, title = "Enriched Pathways" , split=".sign") + facet_grid(.~.sign)

Specific pathway visualization You maybe interested in reviewing any of the KEGG pathways closely for useful interrogation. The pathview functionality offers you the ability to view upregulated and downregulated genes within the context of the specific pathway.

# reviewing the top hit (MAPK signaling pathway) in the KEGG over representation analysis.
# This will bring up a separate window.

pathview(gene.data = ENTREZ_commongenes, pathway.id = "hsa04010", species = "hsa")

KEGG, REACTOME, and WIKI Pathway over representation analysis of the common gene set with the gprofiler2 package The KEGG pathway results should come out to be the same with the clusterprofiler package results. Differences between KEGG and REACTOME pathway results are because of underlying differences in the databases.

gostres <- gost(DEG_commongenes$row, organism = "hsapiens", ordered_query = TRUE,multi_query = TRUE, significant = TRUE, exclude_iea = FALSE,measure_underrepresentation = FALSE,evcodes = TRUE, source = c("KEGG", "REAC", "WP"), user_threshold = 0.05, as_short_link = FALSE, domain_scope="known", correction_method = "g_SCS")

# View the top 30 hits

head(gostres$result, n=30) %>% as_tibble()
## # A tibble: 30 × 11
##    term_id        p_val…¹ signi…² term_…³ query…⁴ inter…⁵ source term_…⁶ effec…⁷
##    <chr>          <list>  <list>    <int> <list>  <list>  <chr>  <chr>     <int>
##  1 REAC:0000000   <dbl>   <lgl>     10770 <int>   <int>   REAC   REACTO…   61498
##  2 KEGG:00000     <dbl>   <lgl>      7559 <int>   <int>   KEGG   KEGG r…   61498
##  3 WP:000000      <dbl>   <lgl>      7364 <int>   <int>   WP     WIKIPA…   61498
##  4 REAC:R-HSA-16… <dbl>   <lgl>      2523 <int>   <int>   REAC   Signal…   61498
##  5 KEGG:01100     <dbl>   <lgl>      1457 <int>   <int>   KEGG   Metabo…   61498
##  6 REAC:R-HSA-14… <dbl>   <lgl>      2075 <int>   <int>   REAC   Metabo…   61498
##  7 KEGG:05200     <dbl>   <lgl>       517 <int>   <int>   KEGG   Pathwa…   61498
##  8 REAC:R-HSA-16… <dbl>   <lgl>      1694 <int>   <int>   REAC   Disease   61498
##  9 REAC:R-HSA-16… <dbl>   <lgl>      2041 <int>   <int>   REAC   Immune…   61498
## 10 REAC:R-HSA-59… <dbl>   <lgl>      1414 <int>   <int>   REAC   Post-t…   61498
## # … with 20 more rows, 2 more variables: source_order <int>, parents <list>,
## #   and abbreviated variable names ¹​p_values, ²​significant, ³​term_size,
## #   ⁴​query_sizes, ⁵​intersection_sizes, ⁶​term_name, ⁷​effective_domain_size

Interactive plot from the above dataset

gostplot(gostres, interactive = TRUE, capped = TRUE)

More pathway over-representation analysis gprofiler2 and msigdb share KEGG, REACTOME and WIKI PATHWAYS in common. msigdb additionally has Biocarta and PID pathway databases. To lookup pathway representations in either of those databases, these are the commands. For the common genes, no pathways were detected in either biocarta or the PID datasets

## Extract gene sets from the category C2, subcategory BIOCARTA from msigdb
## msigdbr_collections() to look up the categories and subcategories

biocarta_gene_sets = msigdbr(species = "human", category = "C2", subcategory = "CP:BIOCARTA")

## Generate a data frame with entrez ids
biocarta_t2g <- biocarta_gene_sets %>% dplyr::distinct(gs_name, entrez_gene) %>% as.data.frame()

## Look for enriched pathways in the common gene list from the Biocarta pathway database
## List should come back empty.

enrichedBiocarta <- enricher(gene = names(ENTREZ_commongenes), TERM2GENE = biocarta_t2g)
enrichedBiocarta$ID %>% as_tibble()
## # A tibble: 0 × 1
## # … with 1 variable: value <chr>

For Transcription factor motif over-representation as well as network analysis, the ChEA3 package is chosen

library(httr)
library(jsonlite)

url <- "https://maayanlab.cloud/chea3/api/enrich/"
encode <- "json"
payload <- list(query_name = "TFBSQuery", gene_set = DEG_commongenes$Gene_Name)

#POST to ChEA3 server

response <- POST(url = url, body = payload, encode = encode)
json <- content(response, "text")

#results as list of R dataframes
results <- fromJSON(json)

# recommendation is to look at the Integrated meanRank results, because that metric performed the best in their benchmark evaluation

results$`Integrated--meanRank` %>% as_tibble()
## # A tibble: 1,632 × 6
##    `Query Name` Rank  TF      Score Library                              Overl…¹
##    <chr>        <chr> <chr>   <chr> <chr>                                <chr>  
##  1 TFBSQuery    1     BHLHE22 24.0  ARCHS4 Coexpression,29;ReMap ChIP-s… EXOC3L…
##  2 TFBSQuery    2     CXXC5   32.0  ARCHS4 Coexpression,15;GTEx Coexpre… KCNC3,…
##  3 TFBSQuery    3     ZNF385B 42.33 ARCHS4 Coexpression,48;Enrichr Quer… STEAP4…
##  4 TFBSQuery    4     ZBTB18  49.5  ARCHS4 Coexpression,7;GTEx Coexpres… ATP8A1…
##  5 TFBSQuery    5     ETV1    52.0  ARCHS4 Coexpression,31;Enrichr Quer… IPO13,…
##  6 TFBSQuery    6     MKX     59.67 ARCHS4 Coexpression,71;Enrichr Quer… NRP1,C…
##  7 TFBSQuery    7     NR4A3   65.67 ARCHS4 Coexpression,45;Enrichr Quer… CSRNP1…
##  8 TFBSQuery    8     MYT1    70.33 ARCHS4 Coexpression,115;Enrichr Que… CDKN1A…
##  9 TFBSQuery    9     NKX63   70.5  ARCHS4 Coexpression,90;GTEx Coexpre… RAB3B,…
## 10 TFBSQuery    10    ZNF488  72.5  ARCHS4 Coexpression,17;Enrichr Quer… RAB3B,…
## # … with 1,622 more rows, and abbreviated variable name ¹​Overlapping_Genes